library(ggplot2)
library(maps)
library(ggrepel)
library(tidyverse)
library(dplyr)
library(tidyr)
library(corrplot)
library(gridExtra)
library(plotly)
library(factoextra)
library(psych)
library(GGally)
# Read the city-level Indian weather snapshot; assumes the CSV sits in the
# working directory -- TODO confirm path when running elsewhere.
air=read.csv('indian_weather_data.csv')
head(air)
# making numerical df
# All strictly numeric columns; this subset feeds cor(), PCA and the
# factor analysis below.
num_df=air[c("lat","lon","temperature","weather_code","co","no2","o3","so2","pm2_5","pm10","wind_speed","wind_degree","pressure", "precip","humidity","cloudcover","feelslike","visibility" )]
num_df=data.frame(num_df)
# making categorical df
# Text/time columns kept aside; not used in the numeric analyses.
cat_df=air[c("city","sunrise","sunset","moonrise","moonset","wind_dir")]
# Missing-value audit: the output below shows every column is complete.
colSums(is.na(air))
## city lat lon temperature weather_code sunrise
## 0 0 0 0 0 0
## sunset moonrise moonset co no2 o3
## 0 0 0 0 0 0
## so2 pm2_5 pm10 wind_speed wind_degree wind_dir
## 0 0 0 0 0 0
## pressure precip humidity cloudcover feelslike uv_index
## 0 0 0 0 0 0
## visibility
## 0
# PCA on the numeric variables. scale=TRUE standardises each column, which
# matters here because units differ wildly (degrees, hPa, ug/m3, km, ...).
# NOTE(review): weather_code is an encoded category, not a true numeric
# measurement -- consider excluding it from PCA/FA.
pca_fit <- prcomp(num_df, scale=TRUE)
pca_summary<-summary(pca_fit)
importance_matrix<-pca_summary$importance
# Convert to data frame
pca_df <- as.data.frame(importance_matrix)
pca_df
# Rotating the factors
# Loadings of the first five principal components.
# FIX: the original indexed with 0:5, which only "works" because R
# silently drops index 0; 1:5 states the intent explicitly.
rotation_df<-as.data.frame(pca_fit$rotation[,1:5])
rotation_df
# Biplot of observations (gray) and variable loadings (blue) on PC1/PC2.
fviz_pca_biplot(pca_fit,
repel = TRUE, # Prevents text overlapping
col.var = "blue", # Variables color
col.ind = "gray", # Individuals color
title = "PCA Biplot")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# making a correlation matrix
correlation_matrix<-cor(num_df)
#KMO Test
# Kaiser-Meyer-Olkin sampling adequacy. The commonly cited rule of thumb
# treats an overall MSA above ~0.6 as acceptable for factor analysis;
# the output below reports 0.69 overall, with several weak items
# (wind_degree, precip, no2 at ~0.35).
kmo_result <- KMO(correlation_matrix)
print(kmo_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = correlation_matrix)
## Overall MSA = 0.69
## MSA for each item =
## lat lon temperature weather_code co no2
## 0.85 0.54 0.72 0.65 0.83 0.35
## o3 so2 pm2_5 pm10 wind_speed wind_degree
## 0.82 0.78 0.75 0.74 0.56 0.34
## pressure precip humidity cloudcover feelslike visibility
## 0.79 0.35 0.56 0.81 0.69 0.49
# Bartlett's Test of Sphericity
# Tests H0 that the correlation matrix is the identity; the tiny p-value
# below says the variables are correlated enough for factor analysis.
cortest.bartlett(correlation_matrix, n = nrow(num_df))
## $chisq
## [1] 1873.867
##
## $p.value
## [1] 1.449223e-293
##
## $df
## [1] 153
# Scree plot of eigenvalues (psych::scree) to judge how many components
# to retain before fitting the factor model.
scree(num_df, factors = FALSE, pc = TRUE,
main = "Scree Plot for Factor Analysis")
# 5-factor exploratory FA with varimax (orthogonal) rotation; the
# regression-method factor scores land in fa$scores and are reused for
# clustering further down.
fa<- fa(num_df, nfactors=5,rotate="varimax",scores="regression")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
# NOTE(review): the ultra-Heywood warning above matches feelslike's
# communality h2 = 1.005 (> 1) in the table below -- the 5-factor
# solution is suspect and should be examined or refit.
fa
## Factor Analysis using method = minres
## Call: fa(r = num_df, nfactors = 5, rotate = "varimax", scores = "regression")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR2 MR3 MR5 MR4 h2 u2 com
## lat 0.48 0.70 -0.21 -0.23 -0.12 0.840 0.1599 2.3
## lon -0.10 0.61 0.32 0.46 0.16 0.720 0.2796 2.7
## temperature -0.20 -0.97 -0.06 -0.08 0.03 0.985 0.0150 1.1
## weather_code 0.24 0.08 0.63 0.27 -0.10 0.543 0.4571 1.8
## co 0.93 0.21 0.15 0.02 -0.03 0.927 0.0731 1.2
## no2 0.45 -0.02 -0.03 0.00 0.66 0.632 0.3683 1.8
## o3 0.73 -0.10 0.23 -0.29 -0.13 0.693 0.3071 1.7
## so2 0.86 0.14 0.10 -0.09 0.16 0.810 0.1903 1.2
## pm2_5 0.94 0.18 0.18 -0.01 -0.09 0.950 0.0502 1.2
## pm10 0.93 0.18 0.19 -0.01 -0.10 0.936 0.0644 1.2
## wind_speed -0.39 -0.27 0.00 0.09 0.78 0.833 0.1668 1.8
## wind_degree -0.08 0.14 0.07 -0.08 0.20 0.074 0.9257 2.9
## pressure 0.19 0.87 0.10 -0.13 -0.01 0.812 0.1880 1.2
## precip -0.12 -0.14 0.14 0.96 -0.07 0.986 0.0140 1.1
## humidity 0.05 0.30 0.76 0.30 0.17 0.785 0.2154 1.8
## cloudcover 0.56 0.28 -0.35 0.16 -0.12 0.561 0.4392 2.5
## feelslike -0.14 -0.99 -0.01 0.01 0.04 1.005 -0.0049 1.0
## visibility -0.24 0.10 -0.79 0.19 -0.04 0.739 0.2611 1.4
##
## MR1 MR2 MR3 MR5 MR4
## SS loadings 5.01 3.97 2.07 1.55 1.23
## Proportion Var 0.28 0.22 0.11 0.09 0.07
## Cumulative Var 0.28 0.50 0.61 0.70 0.77
## Proportion Explained 0.36 0.29 0.15 0.11 0.09
## Cumulative Proportion 0.36 0.65 0.80 0.91 1.00
##
## Mean item complexity = 1.6
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 153 with the objective function = 28.32 with Chi Square = 1873.87
## df of the model are 73 and the objective function was 10.23
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic n.obs is 74 with the empirical chi square 38.28 with prob < 1
## The total n.obs was 74 with Likelihood Chi Square = 642.77 with prob < 4.8e-92
##
## Tucker Lewis Index of factoring reliability = 0.266
## RMSEA index = 0.324 and the 90 % confidence intervals are 0.304 0.35
## BIC = 328.58
## Fit based upon off diagonal values = 0.99
Factors are:
Factor 1: positive effect: co, o3, so2, pm2_5, pm10; negative effect: lon, temperature
Factor 2: positive effect: lat, lon, pressure; negative effect: temperature, feelslike
Factor 3: positive effect: weather_code, humidity; negative effect: visibility
Factor 4: positive effect: no2, wind_speed
Factor 5: positive effect: precip
Factor Analysis is used for identifying the underlying structure of the data. It helps reduce the number of variables, and the obtained factors can be used effectively for EDA.
The first factor can be named pollutants, the second geographic_cond, the third weather_cond, the fourth ozone (since higher wind speed decreases the NO2 concentration), and the fifth precipitation.
# Attach the five factor scores to the raw data: one named column per
# factor, so every city row carries its scores for the later EDA and
# clustering steps.
fa_scores <- setNames(
  as.data.frame(fa$scores),
  c("pollutants", "geographic_cond", "weather_cond", "ozone", "precipitation")
)
df <- cbind(air, fa_scores)
head(df)
# Derive categorical helper columns from the raw codes and coordinates.
df<-df %>%
  # Human-readable weather label; codes not listed fall through to NA.
  # FIX: "partly cloudly" -> "partly cloudy" (label typo).
  # NOTE(review): the code->label pairs look unusual versus the common
  # weatherapi.com scheme (116 is usually "partly cloudy", 122
  # "overcast", 176 "patchy rain") -- confirm against the data
  # provider's code table.
  mutate(weather_cat=case_when(
    weather_code==113~"sunny",
    weather_code==122~"partly cloudy",
    weather_code==143~"mist",
    weather_code==116~"moderate rain",
    weather_code==119~"showers",
    weather_code==248~"fog",
    weather_code==176~"moderate rain"
  )) %>%
  # Bin visibility into qualitative bands. between() is inclusive on
  # both ends and case_when() takes the first matching condition, so a
  # reading of exactly 1, 3 or 5 lands in the earlier band.
  mutate(visibility_cat=case_when(
    visibility<1~"very poor",
    between(visibility,1,3) ~"poor",
    between(visibility,3,5) ~"moderate",
    visibility>5 ~"good"
  )) %>%
  # Coarse lat/lon bounding boxes for regions of India. The boxes
  # overlap, so order matters: a city matching several boxes gets the
  # first label.
  mutate(parts_of_India = case_when(
    between(lat, 28, 37.6) & between(lon, 68.7, 97.25) ~ "North",
    between(lat, 15, 28) & between(lon, 68, 78) ~ "West",
    between(lat, 20, 28) & between(lon, 83, 97.25) ~ "East",
    lat < 20 & between(lon, 74, 84) ~ "South",
    between(lat, 18, 26) & between(lon, 74, 85) ~ "Central",
    between(lat, 22, 28) & between(lon, 89, 97.25) ~ "Northeast",
    TRUE ~ "Other Region"
  ))
# Calculate correlation with pairwise complete observations
# (the colSums(is.na(air)) audit above showed zero NAs, so
# "pairwise.complete.obs" is just a safety net here)
weather_cor <- cor(num_df,
use = "pairwise.complete.obs")
# Custom three-anchor palette; 10 colour steps span the [-1, 1] range.
c_color<- colorRampPalette(c("hotpink", "lightgreen","darkblue"))
corrplot(weather_cor,
method = "color",
title = "Weather Variables Correlation",
order="hclust", # group correlated variables via hierarchical clustering
col=c_color(10),
tl.col="black"
)
* Positive Correlation: the pollutants are highly correlated with each other, such as pm2_5, pm10, co, and so2; no2 is moderately correlated; feelslike and temperature are strongly correlated.
* Moderate Correlation (positive and negative): moderate correlation between co and weather_code, longitude with temperature and feelslike, and latitude with temperature and feelslike.
* Negative Correlation: pressure with feelslike and temperature.
world<-map_data("world")
#getting the map for india
india<-subset(world,region=="India")
# PM2.5 of each city plotted over the India polygon.
ggplot()+
  geom_polygon(data=india,
               aes(x=long,y=lat,group=group))+
  geom_point(data=df,
             aes(x=lon,y=lat,color=pm2_5))+
  scale_color_continuous(
    type = "viridis", # Or "gradient"
    name = "pm 2.5"
  )+
  # FIX: the original ended with scale_x_log10(); log-transforming
  # degrees of longitude has no geographic meaning and visibly distorts
  # the outline. coord_quickmap() applies the conventional aspect-ratio
  # correction for unprojected lat/lon data instead.
  coord_quickmap()
# setting theme for all plots
# FIX: the original called set_theme(), which belongs to the sjPlot
# package and is not provided by any library() call in this script;
# ggplot2's function for registering a global default theme is
# theme_set(). Also dropped the trailing comma inside theme(), which
# base R calls do not accept.
theme_set(theme_minimal() +
  theme(
    plot.title = element_text(size = rel(2)),
    panel.background = element_rect(color = "black")
  ))
# pm10
# Top-20 cities by PM10; bar height = PM10, fill = reported wind direction.
df %>%
  arrange(desc(pm10)) %>%
  select(city, pm10, wind_dir) %>%
  slice_head(n = 20) %>%
  ggplot(aes(x = reorder(city, -pm10), y = pm10, fill = wind_dir)) +
  geom_col() +
  labs(
    title = "Highest pm10 Vs Cities and their wind direction",
    x = "Cities",
    y = "PM 10"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold")
  )
# Top-20 cities by PM2.5; bar height = PM2.5, fill = reported wind direction.
df %>%
  arrange(desc(pm2_5)) %>%
  select(city, pm2_5, wind_dir) %>%
  slice_head(n = 20) %>%
  ggplot(aes(x = reorder(city, -pm2_5), y = pm2_5, fill = wind_dir)) +
  geom_col() +
  labs(
    title = "Highest pm 2.5 Vs Cities and their wind direction",
    x = "Cities",
    y = "PM 2.5"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold")
  )
# One-on-one relationships between two continuous variables:
# Temperature vs PM2.5 levels
# temperature Vs pm2.5 levels
# Scatter of temperature against PM2.5; point size encodes visibility.
plot1<-df %>%
  ggplot(aes(temperature,pm2_5,size=visibility))+
  geom_point(color="orange")+
  labs(
    title="Temperature Vs pm 2.5",
    subtitle="Their influence on Visibility",  # FIX: "There" -> "Their"
    x="Temperature",
    y="pm 2.5"
  )+
  theme(
    aspect.ratio = 1  # square panel so both axes read comparably
  )
plot1
# temperature Vs pressure levels (FIX: the original comment said "pm2.5"
# but this plot maps pressure on the y-axis)
plot2<-df %>%
  ggplot(aes(temperature,pressure,size=visibility))+
  geom_point(color="pink")+
  labs(
    title="Temperature Vs Pressure",
    subtitle="Their influence on Visibility",  # FIX: "There" -> "Their"
    x="Temperature",
    y="Pressure"
  )+
  theme(
    aspect.ratio = 1  # square panel so both axes read comparably
  )
plot2
# Observing 3 variables: wind speed, temperature and pm2.5 concentration
# 3-D scatter of wind speed x temperature x PM2.5, one colour per city.
# NOTE: there are more cities than the 8 colours of the Set2 palette,
# hence the brewer.pal warnings captured below.
fig <- plot_ly(df, x = ~wind_speed, y = ~temperature, z = ~pm2_5, color = ~city)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Data for clustering
# Cities are clustered on three of the factor scores; `city` is kept only
# as a label column and excluded from the distance computations below.
k_data<-df %>%
select("city","pollutants","weather_cond","geographic_cond")
# Diagnostics for choosing k: total within-cluster SS ("elbow") and
# average silhouette width.
fviz_nbclust(k_data[,c("pollutants","weather_cond","geographic_cond")], kmeans, method = "wss") +
ggtitle("Elbow Method")
fviz_nbclust(k_data[,c("pollutants","weather_cond","geographic_cond")], kmeans, method = "silhouette") +
ggtitle("Silhouette Method")
## Performing K-means clustering
# Perform k-means clustering (e.g., 4 clusters)
# set.seed makes the random initial centers reproducible; nstart = 25
# restarts the algorithm 25 times and keeps the best (lowest within-SS)
# solution.
set.seed(123)
kmeans_result <- kmeans(k_data[,c("pollutants","weather_cond","geographic_cond")], centers = 4, nstart = 25)
print(kmeans_result)
## K-means clustering with 4 clusters of sizes 3, 13, 46, 12
##
## Cluster means:
## pollutants weather_cond geographic_cond
## 1 -1.2598530 -0.1369784 3.3428137
## 2 1.3139388 -0.9437692 0.3596657
## 3 -0.5321342 -0.1300377 -0.4335428
## 4 0.9313775 1.5551392 0.4365730
##
## Clustering vector:
## [1] 4 3 4 3 3 3 3 3 3 2 4 4 3 3 3 4 4 4 3 2 2 4 4 4 4 2 2 2 4 2 2 2 1 1 3 3 3 3
## [39] 3 3 2 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 1 3 3
##
## Within cluster sum of squares by cluster:
## [1] 9.656581 6.247595 44.417537 16.614590
## (between_SS / total_SS = 64.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
k_data$cluster <- as.factor(kmeans_result$cluster)
# FIX: the original pipeline's result was printed once and discarded
# because it was never assigned; keep the human-readable cluster names
# on k_data.
k_data <- k_data %>%
  mutate(cluster_name = case_when(
    cluster == 1 ~ "Extremely Clean, Very Warm",
    cluster == 2 ~ "Outlier City",
    cluster == 3 ~ "Clean, Bad Weather",
    cluster == 4 ~ "Very Polluted, Bad Weather",
    # kmeans above ran with centers = 4, so cluster 5 can never occur;
    # branch retained from the original in case k is raised later.
    cluster == 5 ~ "Polluted, Good Weather",
    TRUE ~ "Unknown"
  ))
# Label vector pulled from the global environment; geom_text() below
# relies on its length matching the plotted rows -- fragile if k_data is
# filtered between here and the plot.
city_names<-k_data$city
# Scatter of the clusters in the first two principal components of the
# clustering variables, with city labels overlaid.
fviz_cluster(kmeans_result, data = k_data[,c("pollutants","weather_cond","geographic_cond")],
palette = "Set2", ggtheme = theme_minimal(),
geom = "point") +
geom_text(aes(label = city_names),
check_overlap = TRUE,
size = 3,
vjust = -0.5)+
labs(
title="Clusters Visualization"
)+
theme(
plot.title =
element_text(face = "bold",
size=rel(2)),
panel.background =
element_rect(color="black")
)+
# NOTE(review): this replaces the fill scale fviz_cluster already added
# (hence the warning captured below), and supplies 5 labels although
# only 4 clusters exist -- confirm this layer is actually needed.
scale_fill_discrete(labels = c("A", "B", "C","D","E"))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.